python 微信公众号文章爬取

[python]微信公众号文章爬取

需求

爬取一些微信公众号的文章

数据来源

1.搜狗微信搜索,可以搜索微信公众号文章,但只能显示该公众号最近十篇的文章
2.通过个人微信公众号中的素材管理,查看其他微信公众号文章


步骤

1.手动从网站上获取cookie,通过cookie登录
2.请求 https://mp.weixin.qq.com ,从跳转后的url中获取token
3.拼接参数请求 https://mp.weixin.qq.com/cgi-bin/searchbiz ,获取公众号的fakeid(也就是biz)
4.拼接参数请求 https://mp.weixin.qq.com/cgi-bin/appmsg ,获取文章列表信息
5.通过文章url爬取文章

通过这种方式是没办法得到阅读量和点赞数的,因为网页打开公众号文章是没有阅读数和点赞数的

代码

github仓库地址
import requests
import json
import re
import time

class WeChatCrawler():
    """Print the article links and titles of WeChat official accounts.

    The crawler talks to the mp.weixin.qq.com backend using the cookie of an
    already logged-in session. The cookie is read from ``cookie.txt`` in the
    current working directory (the raw ``name=value; name=value`` header
    string copied from the browser).

    Args:
        wxList: iterable of official-account names to look up and crawl.

    Raises:
        RuntimeError: if no ``token`` can be extracted after requesting the
            backend home page (see ``__getToken``).
    """

    # Number of articles requested per page of the article list.
    PAGE_SIZE = 5
    # Seconds to wait for a single HTTP response; without a timeout
    # ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 10
    # Seconds to pause after each list page.
    PAGE_DELAY = 3

    def __init__(self, wxList):
        self.wxList = wxList
        self.cookies = self.__getCookiesFromText()
        self.token = self.__getToken()
        self.headers = {
            "HOST": "mp.weixin.qq.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"
        }
        # Base query parameters; per-request copies are made so that state
        # from one account never leaks into the request for the next one.
        self.searchBizParam = {
            'action': 'search_biz',
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'query': '',
            'begin': '0',
            'count': str(self.PAGE_SIZE),
        }
        self.getMsgListParam = {
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'action': 'list_ex',
            'begin': '0',
            'count': str(self.PAGE_SIZE),
            'query': '',
            'fakeid': '',
            'type': '9'
        }

    @staticmethod
    def _parseCookieString(cookieStr):
        """Parse a ``name=value; name=value`` cookie header into a dict.

        Only the first ``=`` of each pair separates name from value, so
        values that themselves contain ``=`` are kept intact. Surrounding
        whitespace (including a trailing newline from the file) is stripped.
        Pairs without ``=``, with an empty name, or with an empty value
        (e.g. ``rewardsn=``) are skipped.
        """
        cookies = {}
        for pair in cookieStr.split(';'):
            name, sep, value = pair.partition('=')
            name = name.strip()
            value = value.strip()
            if not sep or not name or not value:
                continue
            cookies[name] = value
        return cookies

    @staticmethod
    def _pageCount(msgCnt, pageSize):
        """Return how many pages of ``pageSize`` cover ``msgCnt`` articles.

        Rounds up so that a final, partially filled page is still fetched;
        returns 0 for a non-positive count.
        """
        if msgCnt <= 0:
            return 0
        return (msgCnt + pageSize - 1) // pageSize

    def __getCookiesFromText(self):
        """Load the manually captured cookie string from ``cookie.txt``."""
        with open('cookie.txt', 'r', encoding='utf-8') as f:
            cookieStr = f.read()
        return self._parseCookieString(cookieStr)

    def __getToken(self):
        """Return the session token taken from the final URL of the home page.

        Raises:
            RuntimeError: if the final URL carries no ``token=<digits>``
                parameter — presumably the cookie is missing or expired.
        """
        url = 'https://mp.weixin.qq.com'
        response = requests.get(url=url, cookies=self.cookies,
                                timeout=self.REQUEST_TIMEOUT)
        match = re.search(r'token=(\d+)', str(response.url))
        if match is None:
            raise RuntimeError(
                'No token found in %s; check that cookie.txt holds a valid, '
                'unexpired cookie' % response.url)
        return match.group(1)

    def __getWXFakeid(self, wx):
        """Return the ``fakeid`` of the first search hit for account ``wx``.

        Returns ``None`` (after printing a notice) when the search response
        contains no results, so one unknown account does not abort the run.
        """
        searchUrl = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        params = dict(self.searchBizParam, query=wx)
        searchResponse = requests.get(searchUrl, cookies=self.cookies,
                                      headers=self.headers, params=params,
                                      timeout=self.REQUEST_TIMEOUT)
        candidates = searchResponse.json().get('list')
        if not candidates:
            print('No official account found for', wx)
            return None
        return candidates[0].get('fakeid')

    def __fetchMsgPage(self, fakeId, begin):
        """Request one article-list page and return the decoded JSON body."""
        appmsgUrl = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        params = dict(self.getMsgListParam, fakeid=fakeId, begin=str(begin))
        response = requests.get(appmsgUrl, cookies=self.cookies,
                                headers=self.headers, params=params,
                                timeout=self.REQUEST_TIMEOUT)
        return response.json()

    def __getWXMsgCnt(self, fakeId):
        """Return ``app_msg_cnt`` from the first list page (``None`` if absent)."""
        return self.__fetchMsgPage(fakeId, 0).get('app_msg_cnt')

    def __getWXMsgList(self, fakeId):
        """Print link and title of every article of the account ``fakeId``.

        Pages through the list ``PAGE_SIZE`` articles at a time, sleeping
        ``PAGE_DELAY`` seconds after each page. Stops early if a page comes
        back without articles.
        """
        wxMsgCnt = self.__getWXMsgCnt(fakeId)
        if wxMsgCnt is None:
            return
        pages = self._pageCount(int(wxMsgCnt), self.PAGE_SIZE)
        for page in range(pages):
            begin = page * self.PAGE_SIZE
            print('====翻页====', begin)
            msgList = self.__fetchMsgPage(fakeId, begin).get('app_msg_list')
            if not msgList:
                # NOTE(review): presumably rate limiting or an expired
                # session; further pages would fail the same way.
                print('No article list returned at offset', begin)
                break
            for item in msgList:
                # todo more
                msgLink = item.get('link')
                print(msgLink)
                msgTitle = item.get('title')
                print(msgTitle)
            time.sleep(self.PAGE_DELAY)

    def runCrawler(self):
        """Resolve every account in ``wxList`` and print its article list.

        All accounts are looked up first, then crawled in order; accounts
        that could not be resolved are skipped.
        """
        fakeIds = [self.__getWXFakeid(wx) for wx in self.wxList]
        for fakeId in fakeIds:
            if fakeId is not None:
                self.__getWXMsgList(fakeId)

if __name__ == '__main__':
    # Example run: crawl a single official account by its display name.
    accounts = ['量子位']
    crawler = WeChatCrawler(accounts)
    crawler.runCrawler()